In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
In [2]:
dfs = pd.read_excel("input_data.xlsx")
In [3]:
dfs.head(20)
Out[3]:
Short description Description Caller Assignment group
0 login issue -verified user details.(employee# & manager name)_x000D_\n-checked the user name in ad and reset the password._x000D_\n-advised the user to login and check._x000D_\n-caller confirmed that he was able to login._x000D_\n-issue resolved. spxjnwir pjlcoqds GRP_0
1 outlook _x000D_\n_x000D_\nreceived from: hmjdrvpb.komuaywn@gmail.com_x000D_\n_x000D_\nhello team,_x000D_\n_x000D_\nmy meetings/skype meetings etc are not appearing in my outlook calendar, can somebody please advise how to correct this?_x000D_\n_x000D_\nkind hmjdrvpb komuaywn GRP_0
2 cant log in to vpn _x000D_\n_x000D_\nreceived from: eylqgodm.ybqkwiam@gmail.com_x000D_\n_x000D_\nhi_x000D_\n_x000D_\ni cannot log on to vpn_x000D_\n_x000D_\nbest eylqgodm ybqkwiam GRP_0
3 unable to access hr_tool page unable to access hr_tool page xbkucsvz gcpydteq GRP_0
4 skype error skype error owlgqjme qhcozdfx GRP_0
5 unable to log in to engineering tool and skype unable to log in to engineering tool and skype eflahbxn ltdgrvkz GRP_0
6 event: critical:HostName_221.company.com the value of mountpoint threshold for /oracle/SID_37/erpdata21/ event: critical:HostName_221.company.com the value of mountpoint threshold for /oracle/SID_37/erpdata21/sr3psa1d_7/sr3psa1d.data7,perpsr3psa1d,4524 is 98 jyoqwxhz clhxsoqy GRP_1
7 ticket_no1550391- employment status - new non-employee [enter user's name] ticket_no1550391- employment status - new non-employee [enter user's name] eqzibjhw ymebpoih GRP_0
8 unable to disable add ins on outlook unable to disable add ins on outlook mdbegvct dbvichlg GRP_0
9 ticket update on inplant_874773 ticket update on inplant_874773 fumkcsji sarmtlhy GRP_0
10 engineering tool says not connected and unable to submit reports engineering tool says not connected and unable to submit reports badgknqs xwelumfz GRP_0
11 hr_tool site not loading page correctly hr_tool site not loading page correctly dcqsolkx kmsijcuz GRP_0
12 unable to login to hr_tool to sgxqsuojr xwbesorf cards unable to login to hr_tool to sgxqsuojr xwbesorf cards oblekmrw qltgvspb GRP_0
13 user wants to reset the password user wants to reset the password iftldbmu fujslwby GRP_0
14 unable to open payslips unable to open payslips epwyvjsz najukwho GRP_0
15 ticket update on inplant_874743 ticket update on inplant_874743 fumkcsji sarmtlhy GRP_0
16 unable to login to company vpn \n\nreceived from: xyz@company.com\n\nhi,\n\ni am unable to login to company vpn website, trying to open a new session using the below link, but not able to get through pls help urgently as we are working from home tomorrow due to month end closing. chobktqj qdamxfuc GRP_0
17 when undocking pc , screen will not come back when undocking pc , screen will not come back sigfdwcj reofwzlm GRP_3
18 erp SID_34 account locked erp SID_34 account locked nqdyowsm yqerwtna GRP_0
19 unable to sign into vpn unable to sign into vpn ftsqkvre bqzrupic GRP_0
In [4]:
dfs.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8500 entries, 0 to 8499
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Short description  8492 non-null   object
 1   Description        8499 non-null   object
 2   Caller             8500 non-null   object
 3   Assignment group   8500 non-null   object
dtypes: object(4)
memory usage: 265.8+ KB
In [5]:
dfs["Assignment group"].nunique()
Out[5]:
74
In [6]:
dfs["Assignment group"].value_counts()
Out[6]:
GRP_0     3976
GRP_8      661
GRP_24     289
GRP_12     257
GRP_9      252
GRP_2      241
GRP_19     215
GRP_3      200
GRP_6      184
GRP_13     145
GRP_10     140
GRP_5      129
GRP_14     118
GRP_25     116
GRP_33     107
GRP_4      100
GRP_29      97
GRP_18      88
GRP_16      85
GRP_17      81
GRP_31      69
GRP_7       68
GRP_34      62
GRP_26      56
GRP_40      45
GRP_28      44
GRP_41      40
GRP_30      39
GRP_15      39
GRP_42      37
GRP_20      36
GRP_45      35
GRP_22      31
GRP_1       31
GRP_11      30
GRP_21      29
GRP_47      27
GRP_62      25
GRP_48      25
GRP_23      25
GRP_60      20
GRP_39      19
GRP_27      18
GRP_37      16
GRP_44      15
GRP_36      15
GRP_50      14
GRP_65      11
GRP_53      11
GRP_52       9
GRP_55       8
GRP_51       8
GRP_49       6
GRP_46       6
GRP_59       6
GRP_43       5
GRP_66       4
GRP_32       4
GRP_68       3
GRP_63       3
GRP_38       3
GRP_58       3
GRP_56       3
GRP_57       2
GRP_54       2
GRP_69       2
GRP_71       2
GRP_72       2
GRP_61       1
GRP_64       1
GRP_67       1
GRP_35       1
GRP_70       1
GRP_73       1
Name: Assignment group, dtype: int64
In [7]:
plt.subplots(figsize = (20,5))
plt.xticks(rotation=90)
plt.plot(dfs["Assignment group"].value_counts())
plt.xlabel("Assignment group")
plt.ylabel("count")
plt.grid()
plt.show()
In [8]:
dfs.isnull().sum()
Out[8]:
Short description    8
Description          1
Caller               0
Assignment group     0
dtype: int64
In [9]:
dfs[dfs.isnull().any(axis=1)]
Out[9]:
Short description Description Caller Assignment group
2604 NaN _x000D_\n_x000D_\nreceived from: ohdrnswl.rezuibdt@gmail.com_x000D_\n_x000D_\nhi,_x000D_\n_x000D_\n_x000D_\n_x000D_\nlink is not working. kindly resolve yhe issue on urgent basis._x000D_\n_x000D_\n_x000D_\n_x000D_\nbest ohdrnswl rezuibdt GRP_34
3383 NaN _x000D_\n-connected to the user system using teamviewer._x000D_\n-help the user login to the portal._x000D_\n-issue resolved. qftpazns fxpnytmk GRP_0
3906 NaN -user unable tologin to vpn._x000D_\n-connected to the user system using teamviewer._x000D_\n-help the user login to the company vpn using the vpn company vpn link._x000D_\n-issue resolved. awpcmsey ctdiuqwe GRP_0
3910 NaN -user unable tologin to vpn._x000D_\n-connected to the user system using teamviewer._x000D_\n-help the user login to the company vpn using the vpn company vpn link._x000D_\n-issue resolved. rhwsmefo tvphyura GRP_0
3915 NaN -user unable tologin to vpn._x000D_\n-connected to the user system using teamviewer._x000D_\n-help the user login to the company vpn using the vpn company vpn link._x000D_\n-issue resolved. hxripljo efzounig GRP_0
3921 NaN -user unable tologin to vpn._x000D_\n-connected to the user system using teamviewer._x000D_\n-help the user login to the company vpn using the vpn company vpn link._x000D_\n-issue resolved. cziadygo veiosxby GRP_0
3924 NaN name:wvqgbdhm fwchqjor\nlanguage:\nbrowser:microsoft internet explorer\nemail:wvqgbdhm.fwchqjor@gmail.com\ncustomer number:\ntelephone:-not available\nsummary:can't get into vpn - need to be on at 4:30 est and it wont' happen please help aerp! wvqgbdhm fwchqjor GRP_0
4341 NaN _x000D_\n_x000D_\nreceived from: eqmuniov.ehxkcbgj@gmail.com_x000D_\n_x000D_\ngood morning,_x000D_\n_x000D_\nwhen trying to log on to erp i get this error below._x000D_\nplease help urgently as i can not process customer order._x000D_\n_x000D_\n[cid:image001.png@01d20f2e.751db880]_x000D_\n_x000D_\n_x000D_\n eqmuniov ehxkcbgj GRP_0
4395 i am locked out of skype NaN viyglzfo ajtfzpkb GRP_0
In [10]:
duplicate_df=dfs[dfs.duplicated(subset = None, keep = 'first')]
In [11]:
duplicate_df.shape
Out[11]:
(83, 4)
In [12]:
duplicate_df
Out[12]:
Short description Description Caller Assignment group
51 call for ecwtrjnq jpecxuty call for ecwtrjnq jpecxuty olckhmvx pcqobjnd GRP_0
229 call for ecwtrjnq jpecxuty call for ecwtrjnq jpecxuty olckhmvx pcqobjnd GRP_0
493 ticket update on inplant_872730 ticket update on inplant_872730 fumkcsji sarmtlhy GRP_0
512 blank call //gso blank call //gso rbozivdq gmlhrtvp GRP_0
667 job bkbackup_tool_powder_prod_full failed in job_scheduler at: 10/22/2016 18:33:00 received from: monitoring_tool@company.com_x000D_\n_x000D_\njob bkbackup_tool_powder_prod_full failed in job_scheduler at: 10/22/2016 18:33:00 bpctwhsn kzqsbmtp GRP_8
724 blank call blank call rbozivdq gmlhrtvp GRP_0
1064 job Job_1967d failed in job_scheduler at: 10/19/2016 00:02:00 received from: monitoring_tool@company.com_x000D_\n_x000D_\njob Job_1967d failed in job_scheduler at: 10/19/2016 00:02:00 bpctwhsn kzqsbmtp GRP_8
1125 blank call blank call rbozivdq gmlhrtvp GRP_0
1744 phone issue phone issue gzjtweph mnslwfqv GRP_0
1851 reset passwords for fylrosuk kedgmiul using password_management_tool password reset. the fylrosuk kedgmiul GRP_17
1982 call came and got disconnected call came and got disconnected rbozivdq gmlhrtvp GRP_0
2000 job Job_549 failed in job_scheduler at: 10/07/2016 23:05:00 received from: monitoring_tool@company.com_x000D_\n_x000D_\njob Job_549 failed in job_scheduler at: 10/07/2016 23:05:00 bpctwhsn kzqsbmtp GRP_8
2061 blank call // loud noise // gso blank call // loud noise // gso rbozivdq gmlhrtvp GRP_0
2141 blank call blank call rbozivdq gmlhrtvp GRP_0
2533 reset passwords for qwsjptlo hnlasbed using password_management_tool password reset. the goaxzsql qpjnbgsa GRP_17
2554 reset passwords for bxeagsmt zrwdgsco using password_management_tool password reset. the bxeagsmt zrwdgsco GRP_17
2683 ticket update ticket update pbhmwqtz wqlbudjx GRP_0
2714 call for ecwtrjnq jpecxuty call for ecwtrjnq jpecxuty olckhmvx pcqobjnd GRP_0
2720 german call german call ayrhcfxi zartupsw GRP_0
2789 blank call blank call pwkrlqbc zslqfmka GRP_0
2875 blank call blank call pwkrlqbc zslqfmka GRP_0
2876 blank call blank call pwkrlqbc zslqfmka GRP_0
3085 call for ecwtrjnq jpecxuty call for ecwtrjnq jpecxuty olckhmvx pcqobjnd GRP_0
3219 call for ecwtrjnq jpecxuty call for ecwtrjnq jpecxuty olckhmvx pcqobjnd GRP_0
3619 call came and got disconnected call came and got disconnected rbozivdq gmlhrtvp GRP_0
3637 blank call blank call fumkcsji sarmtlhy GRP_0
3647 答复: 答复: order products online problem _x000D_\n_x000D_\nreceived from: fkdazsmi.yecbrofv@gmail.com_x000D_\n_x000D_\nhello,_x000D_\ni will ask your help if i can't solve it with the help of my boss tomorrow. fkdazsmi yecbrofv GRP_0
3693 reset passwords for mvhcoqed konjdmwq using password_management_tool password reset. the mvhcoqed konjdmwq GRP_17
3908 vpn not working- vpn.company.com link is giving error vpn not working- vpn.company.com link is giving error kailyenh zfyvkopr GRP_0
4094 job Job_2883 failed in job_scheduler at: 09/18/2016 00:02:00 received from: monitoring_tool@company.com_x000D_\n_x000D_\njob Job_2883 failed in job_scheduler at: 09/18/2016 00:02:00 bpctwhsn kzqsbmtp GRP_8
4229 not able to access -inq industrial (-inq.industrial@company.com) _x000D_\n_x000D_\nreceived from: muqdlobv.qflsdahg@gmail.com_x000D_\n_x000D_\nteam,_x000D_\n_x000D_\ni am not able access to -inq industrial (-inq.industrial@company.com<mailto:-inq.industrial@company.com>) for which we receive enquiries about special products._x000D_\nfrom past one week above mentioned shared email box is not functioning for me, whereas other members are able to access the same._x000D_\nso i kindly request you to take it upon high priority and confirm back._x000D_\n_x000D_\n muqdlobv qflsdahg GRP_0
4273 blank call blank call pwkrlqbc zslqfmka GRP_0
4303 call for ecwtrjnq jpecxuty call for ecwtrjnq jpecxuty olckhmvx pcqobjnd GRP_0
4361 account locked in ad account locked in ad gvsabjhq cgwsbiep GRP_0
4495 job SID_37hoti failed in job_scheduler at: 09/13/2016 19:53:00 received from: monitoring_tool@company.com_x000D_\n_x000D_\njob SID_37hoti failed in job_scheduler at: 09/13/2016 19:53:00 bpctwhsn kzqsbmtp GRP_5
4530 blank call blank call fumkcsji sarmtlhy GRP_0
4550 call disconnected due to vpn disconnection call disconnected due to vpn disconnection rbozivdq gmlhrtvp GRP_0
4704 private address fields are enabled on employee master crm ui disable private address fields, new & edit buttons on employee master crm ui tavsikpl dcrkwuny GRP_15
4881 install company barcode für ewew8323504 \vzqomdgt jwoqbuml install company barcode für ewew8323504 \vzqomdgt jwoqbuml vzqomdgt jwoqbuml GRP_24
4984 reset passwords for cubdsrml znewqgop using password_management_tool password reset. the cubdsrml znewqgop GRP_17
4991 reset passwords for davidthd robankm using password_management_tool password reset. the zelunfcq yimdwjrp GRP_17
5212 blank call blank call fumkcsji sarmtlhy GRP_0
5226 blank call blank call olckhmvx pcqobjnd GRP_0
5317 reset passwords for bxeagsmt zrwdgsco using password_management_tool password reset. the bxeagsmt zrwdgsco GRP_17
5488 job SID_38hotf failed in job_scheduler at: 09/01/2016 22:20:00 received from: monitoring_tool@company.com_x000D_\n_x000D_\njob SID_38hotf failed in job_scheduler at: 09/01/2016 22:20:00 bpctwhsn kzqsbmtp GRP_8
5521 blank call //gso blank call //gso rbozivdq gmlhrtvp GRP_0
5708 reset passwords for bxeagsmt zrwdgsco using password_management_tool password reset. the bxeagsmt zrwdgsco GRP_17
5884 reset passwords for bxeagsmt zrwdgsco using password_management_tool password reset. the bxeagsmt zrwdgsco GRP_17
5928 ticket update on inplant_855239 ticket update on inplant_855239 fumkcsji sarmtlhy GRP_0
5945 blank call //gso blank call //gso rbozivdq gmlhrtvp GRP_0
6058 reset passwords for bxeagsmt zrwdgsco using password_management_tool password reset. the bxeagsmt zrwdgsco GRP_17
6130 job Job_749 failed in job_scheduler at: 08/27/2016 15:29:00 received from: monitoring_tool@company.com_x000D_\n_x000D_\njob Job_749 failed in job_scheduler at: 08/27/2016 15:29:00 bpctwhsn kzqsbmtp GRP_8
6141 job Job_1989 failed in job_scheduler at: 08/27/2016 11:24:00 received from: monitoring_tool@company.com_x000D_\n_x000D_\njob Job_1989 failed in job_scheduler at: 08/27/2016 11:24:00 bpctwhsn kzqsbmtp GRP_6
6252 job Job_3028 failed in job_scheduler at: 08/26/2016 00:02:00 received from: monitoring_tool@company.com_x000D_\n_x000D_\njob Job_3028 failed in job_scheduler at: 08/26/2016 00:02:00 bpctwhsn kzqsbmtp GRP_8
6260 job Job_3028 failed in job_scheduler at: 08/25/2016 22:02:00 received from: monitoring_tool@company.com_x000D_\n_x000D_\njob Job_3028 failed in job_scheduler at: 08/25/2016 22:02:00 bpctwhsn kzqsbmtp GRP_8
6265 job pp_EU_tool_netch_ap1 failed in job_scheduler at: 08/25/2016 16:14:00 received from: monitoring_tool@company.com_x000D_\n_x000D_\njob pp_EU_tool_netch_ap1 failed in job_scheduler at: 08/25/2016 16:14:00 bpctwhsn kzqsbmtp GRP_8
6321 job Job_1314 failed in job_scheduler at: 08/25/2016 08:15:00 received from: monitoring_tool@company.com_x000D_\n_x000D_\njob Job_1314 failed in job_scheduler at: 08/25/2016 08:15:00 bpctwhsn kzqsbmtp GRP_60
6323 job Job_1314 failed in job_scheduler at: 08/25/2016 08:15:00 received from: monitoring_tool@company.com_x000D_\n_x000D_\njob Job_1314 failed in job_scheduler at: 08/25/2016 08:15:00 bpctwhsn kzqsbmtp GRP_60
6340 probleme mit erpgui \vsdtxwry ngkcdjye probleme mit erpgui \vsdtxwry ngkcdjye vsdtxwry ngkcdjye GRP_24
6411 svc-now ticket found... doing nothing received from: monitoring_tool@company.com_x000D_\n_x000D_\nsvc-now ticket found... doing nothing bpctwhsn kzqsbmtp GRP_60
6412 svc-now ticket found... doing nothing received from: monitoring_tool@company.com_x000D_\n_x000D_\nsvc-now ticket found... doing nothing bpctwhsn kzqsbmtp GRP_60
6471 job SID_41arc2 failed in job_scheduler at: 08/24/2016 07:04:00 received from: monitoring_tool@company.com_x000D_\n_x000D_\njob SID_41arc2 failed in job_scheduler at: 08/24/2016 07:04:00 bpctwhsn kzqsbmtp GRP_8
6485 job SID_31arc2 failed in job_scheduler at: 08/24/2016 06:09:00 received from: monitoring_tool@company.com_x000D_\n_x000D_\njob SID_31arc2 failed in job_scheduler at: 08/24/2016 06:09:00 bpctwhsn kzqsbmtp GRP_8
6521 job Job_3028 failed in job_scheduler at: 08/24/2016 00:02:00 received from: monitoring_tool@company.com_x000D_\n_x000D_\njob Job_3028 failed in job_scheduler at: 08/24/2016 00:02:00 bpctwhsn kzqsbmtp GRP_8
6522 job Job_3028 failed in job_scheduler at: 08/24/2016 00:02:00 received from: monitoring_tool@company.com_x000D_\n_x000D_\njob Job_3028 failed in job_scheduler at: 08/24/2016 00:02:00 bpctwhsn kzqsbmtp GRP_8
6523 job Job_3028 failed in job_scheduler at: 08/24/2016 00:02:00 received from: monitoring_tool@company.com_x000D_\n_x000D_\njob Job_3028 failed in job_scheduler at: 08/24/2016 00:02:00 bpctwhsn kzqsbmtp GRP_8
6524 job Job_3028 failed in job_scheduler at: 08/24/2016 00:02:00 received from: monitoring_tool@company.com_x000D_\n_x000D_\njob Job_3028 failed in job_scheduler at: 08/24/2016 00:02:00 bpctwhsn kzqsbmtp GRP_8
6603 account unlock account unlock jusenflm sufbehom GRP_0
6659 job Job_3028 failed in job_scheduler at: 08/23/2016 00:02:00 received from: monitoring_tool@company.com_x000D_\n_x000D_\njob Job_3028 failed in job_scheduler at: 08/23/2016 00:02:00 bpctwhsn kzqsbmtp GRP_8
6739 blank call // gso blank call // gso rbozivdq gmlhrtvp GRP_0
6819 reset passwords for wvdxnkhf jirecvta using password_management_tool password reset. the wvdxnkhf jirecvta GRP_17
6942 call came and got disconnected call came and got disconnected rbozivdq gmlhrtvp GRP_0
6992 probleme mit erpgui \tmqfjard qzhgdoua probleme mit erpgui \tmqfjard qzhgdoua tmqfjard qzhgdoua GRP_24
7034 blank call blank call fumkcsji sarmtlhy GRP_0
7132 reset passwords for ezrsdgfc hofgvwel using password_management_tool password reset. the ezrsdgfc hofgvwel GRP_17
7459 account locked in ad account locked in ad upiyobvj lwohuizr GRP_0
7756 german call german call rbozivdq gmlhrtvp GRP_0
7772 blank call // loud noise blank call // loud noise rbozivdq gmlhrtvp GRP_0
7836 probleme mit erpgui \tmqfjard qzhgdoua probleme mit erpgui \tmqfjard qzhgdoua tmqfjard qzhgdoua GRP_24
8051 issue on pricing in distributor_tool we have agreed price with many of the distributors for a given period and skus. this is specified through pricing condition zcnc in erp. in distributor_tool, it is order through sold to & ship to combination._x000D_\n- till august – flat rate deployment, we did not have any issues._x000D_\n- today, when the distributor tried to book the order with zcnc pricing condition. the initial screen shows the correct price. but when the item is selected and quick order is clicked on, the price is getting changed to list price less standard discount instead of retaining the zcnc price. hbmwlprq ilfvyodx GRP_21
8093 reset passwords for prgthyuulla ramdntythanjesh using password_management_tool password reset. the boirqctx bkijgqry GRP_17
8347 blank call // loud noise blank call // loud noise rbozivdq gmlhrtvp GRP_0
8405 unable to launch outlook unable to launch outlook wjtzrmqc ikqpbflg GRP_0
In [13]:
#drop NaNs and duplicate rows
dfs.dropna(inplace=True)
In [14]:
dfs.shape
Out[14]:
(8491, 4)
In [15]:
dfs.drop_duplicates(subset = None, keep = 'first',inplace=True)
In [16]:
dfs.shape
Out[16]:
(8408, 4)
In [17]:
#preprocess and clean the data
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))

#define a function for preprocessing the data
def preprocess_text(df, column_name=''):
    
    # Remove email Ids
    df[column_name] = df[column_name].apply(lambda x: re.sub(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]{2,4}',' ',x))
    
    # Remove label url link
    df[column_name]=df[column_name].apply(lambda x: re.sub(r'urlLink|urllink','',x))
    
    # remove all the places where that starts with http or https
    df[column_name]=df[column_name].apply(lambda x: re.sub(r'https?\S+','',x))   
    
    # Strip unwanted spaces
    df[column_name] = df[column_name].apply(lambda x: x.strip())
    
    # Select only alphabets
    df[column_name] = df[column_name].apply(lambda x: re.sub('[^A-Za-z]+', ' ', x))
  
    # Convert text to lowercase
    df[column_name] = df[column_name].apply(lambda x: x.lower())

    # Remove stopwords
    df[column_name] = df[column_name].apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords]))

    #Remove hello
    df[column_name] = df[column_name].apply(lambda x: re.sub(r'hello ', ' ', x))

    # Replace empty strings with Null
    df[column_name].replace('', np.nan, inplace = True)
    
    # Drop Null values
    df = df.dropna()

    return df
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dgupta3/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
In [18]:
dfs = preprocess_text(dfs, column_name='Description')
In [19]:
dfs = preprocess_text(dfs, column_name='Short description')
In [20]:
dfs.head(20)
Out[20]:
Short description Description Caller Assignment group
0 login issue verified user details employee manager name x checked user name ad reset password x advised user login check x caller confirmed able login x issue resolved spxjnwir pjlcoqds GRP_0
1 outlook x x received x x team x x meetings skype meetings etc appearing outlook calendar somebody please advise correct x x kind hmjdrvpb komuaywn GRP_0
2 cant log vpn x x received x x hi x x cannot log vpn x x best eylqgodm ybqkwiam GRP_0
3 unable access hr tool page unable access hr tool page xbkucsvz gcpydteq GRP_0
4 skype error skype error owlgqjme qhcozdfx GRP_0
5 unable log engineering tool skype unable log engineering tool skype eflahbxn ltdgrvkz GRP_0
6 event critical hostname company com value mountpoint threshold oracle sid erpdata event critical hostname company com value mountpoint threshold oracle sid erpdata sr psa sr psa data perpsr psa jyoqwxhz clhxsoqy GRP_1
7 ticket employment status new non employee enter user name ticket employment status new non employee enter user name eqzibjhw ymebpoih GRP_0
8 unable disable add ins outlook unable disable add ins outlook mdbegvct dbvichlg GRP_0
9 ticket update inplant ticket update inplant fumkcsji sarmtlhy GRP_0
10 engineering tool says connected unable submit reports engineering tool says connected unable submit reports badgknqs xwelumfz GRP_0
11 hr tool site loading page correctly hr tool site loading page correctly dcqsolkx kmsijcuz GRP_0
12 unable login hr tool sgxqsuojr xwbesorf cards unable login hr tool sgxqsuojr xwbesorf cards oblekmrw qltgvspb GRP_0
13 user wants reset password user wants reset password iftldbmu fujslwby GRP_0
14 unable open payslips unable open payslips epwyvjsz najukwho GRP_0
15 ticket update inplant ticket update inplant fumkcsji sarmtlhy GRP_0
16 unable login company vpn received hi unable login company vpn website trying open new session using link able get pls help urgently working home tomorrow due month end closing chobktqj qdamxfuc GRP_0
17 undocking pc screen come back undocking pc screen come back sigfdwcj reofwzlm GRP_3
18 erp sid account locked erp sid account locked nqdyowsm yqerwtna GRP_0
19 unable sign vpn unable sign vpn ftsqkvre bqzrupic GRP_0
In [21]:
#check the distribution of words in each sentence
word_count_SD = dfs['Short description'].apply(lambda x: len(x.split()))

sns.displot(word_count_SD)
Out[21]:
<seaborn.axisgrid.FacetGrid at 0x7fcca844d130>
In [22]:
word_count_Des = dfs['Description'].apply(lambda x: len(x.split()))

sns.displot(word_count_Des)
Out[22]:
<seaborn.axisgrid.FacetGrid at 0x7fccc868a6d0>
In [23]:
!pip install wordcloud
import heapq
from wordcloud import WordCloud
## function to create Word Cloud
def show_wordcloud(data, title):
    wordcloud = WordCloud(
        background_color='white',
        stopwords=stopwords,
        max_words=100,
        max_font_size=40, 
        scale=3,
        random_state=1 
    ).generate(str(data))

    type(wordcloud.words_.keys())
    #print("Top words are: ", heapq.nlargest(100, wordcloud.words_.keys()))
    print("Top words are: ", list(wordcloud.words_.keys())[0: 10])
    
    fig = plt.figure(1, figsize=(15, 12))
    plt.axis('off')
    if title: 
        fig.suptitle(title, fontsize=20)
        fig.subplots_adjust(top=2.3)

    plt.imshow(wordcloud)
    plt.show()
Requirement already satisfied: wordcloud in /Users/dgupta3/opt/anaconda3/lib/python3.9/site-packages (1.8.2.2)
Requirement already satisfied: matplotlib in /Users/dgupta3/opt/anaconda3/lib/python3.9/site-packages (from wordcloud) (3.5.1)
Requirement already satisfied: pillow in /Users/dgupta3/opt/anaconda3/lib/python3.9/site-packages (from wordcloud) (9.0.1)
Requirement already satisfied: numpy>=1.6.1 in /Users/dgupta3/opt/anaconda3/lib/python3.9/site-packages (from wordcloud) (1.21.5)
Requirement already satisfied: kiwisolver>=1.0.1 in /Users/dgupta3/opt/anaconda3/lib/python3.9/site-packages (from matplotlib->wordcloud) (1.3.2)
Requirement already satisfied: fonttools>=4.22.0 in /Users/dgupta3/opt/anaconda3/lib/python3.9/site-packages (from matplotlib->wordcloud) (4.25.0)
Requirement already satisfied: python-dateutil>=2.7 in /Users/dgupta3/opt/anaconda3/lib/python3.9/site-packages (from matplotlib->wordcloud) (2.8.2)
Requirement already satisfied: packaging>=20.0 in /Users/dgupta3/opt/anaconda3/lib/python3.9/site-packages (from matplotlib->wordcloud) (21.3)
Requirement already satisfied: pyparsing>=2.2.1 in /Users/dgupta3/opt/anaconda3/lib/python3.9/site-packages (from matplotlib->wordcloud) (3.0.4)
Requirement already satisfied: cycler>=0.10 in /Users/dgupta3/opt/anaconda3/lib/python3.9/site-packages (from matplotlib->wordcloud) (0.11.0)
Requirement already satisfied: six>=1.5 in /Users/dgupta3/opt/anaconda3/lib/python3.9/site-packages (from python-dateutil>=2.7->matplotlib->wordcloud) (1.16.0)
In [24]:
topGroups = dfs["Assignment group"].value_counts().head(10)

for items in topGroups.iteritems():
    groupName = items[0]
    text_Str1 = dfs['Short description'][dfs['Assignment group'].isin([groupName])].tolist()
    show_wordcloud(text_Str1, groupName + " WORD CLOUD" )
    print("\n")
    
Top words are:  ['erp', "password reset'", 'outlook', 'erp sid', "account locked'", 'ticket update', 'password reset', "issue'", 'access', 'user']

Top words are:  ['failed job', "job scheduler'", "scheduler' job", 'job failed', 'network outage', 'company', "since et'", 'job scheduler', 'circuit outage', 'abended job']

Top words are:  ['probleme mit', 'f r', 'setup new', 'new ws', 'ewew', 'rechner', 'install', 'defekt', 'eu tool', 'drucker']

Top words are:  ['hostname', 'server', 'access', "hostname'", 'space consumed', 'consumed space', 'disk free', 'file', 'folder', 'average samples']

Top words are:  ['failed job', "job scheduler'", 'job failed', "scheduler' job", 'abended job', 'job scheduler', "scheduler job'", "job' abended", 'bobj', 'report']

Top words are:  ['user', 'account', 'erp access', 'sid', 'need', 'erp', 'security incidents', 'please', "access issue'", 'password']

Top words are:  ['laptop', "issue'", "working'", 'unable', 'printer', 'need', 'new', "laptop'", 'system', 'id']

Top words are:  ['pc', 'cannot', 'monitor', 'print', 'printer', 'need', 'new', 'get', 'unable', 'client']

Top words are:  ['failed job', "job scheduler'", "scheduler' job", 'job failed', 'job scheduler', 'abended job', 'snp heu', "scheduler job'", 'create delivery', "job' abended"]

Top words are:  ['inwarehouse tool', 'order', 'erp', 'sale', 'customer', 'unable', 'please', 'error', 'mm', 'price']

In [25]:
for items in topGroups.iteritems():
    groupName = items[0]
    text_Str1 = dfs['Description'][dfs['Assignment group'].isin([groupName])].tolist()
    show_wordcloud(text_Str1, groupName + " WORD CLOUD" )
    print("\n")
Top words are:  ['x x', 'x received', 'received x', "x x'", 'outlook', 'erp sid', 'access', 'need', "password reset'", 'x please']

Top words are:  ['x x', 'yes na', 'na x', 'backup circuit', 'received x', 'x job', 'failed job', "job scheduler'", 'telecom vendor', 'na yes']

Top words are:  ['probleme mit', 'f r', 'ewew', 'setup new', 'new ws', 'x x', 'install', 'und', 'rechner', 'defekt']

Top words are:  ['x x', 'hostname', 'server', 'asa deny', 'deny tcp', 'tcp src', 'src inside', 'inside dst', 'dst outside', 'outside access']

Top words are:  ['x x', 'received x', 'job job', 'x job', 'failed job', "job scheduler'", 'job failed', "scheduler' received", 'report', 'x abended']

Top words are:  ['x x', 'f e', 'e f', 'e e', 'sid sid', 'f f', 'x event', 'e c', 'asa deny', 'deny tcp']

Top words are:  ['x x', 'laptop', 'x received', 'please', 'received x', 'system', 'x sep', 'issue', 'working', 'name']

Top words are:  ['x x', 'x', 'need', 'pc', 'x sep', 'sep asa', 'inside x', 'user', 'please', 'printer']

Top words are:  ['x x', 'received x', 'job job', 'failed job', "job scheduler'", "scheduler' received", 'job failed', 'plant', 'x abended', 'abended job']

Top words are:  ['x x', 'inwarehouse tool', 'order', 'customer', 'item', 'erp', 'x please', 'issue', 'received x', "x x'"]

In [26]:
!pip install fasttext
import fasttext
lang_model = fasttext.load_model('lid.176.bin')

sentences = ['unable check payslips']
predictions = lang_model.predict(sentences)
print(predictions[0])
#this was detected as french: [['__label__fr']]

def language_detect(sentences):
    return lang_model.predict(sentences)[0]

dfs['language'] = dfs['Short description'].apply(language_detect)

#Language detection did not work using below
#pre-trained models lid.176.bin(126 MB) and lid.176.ftz(917kb)
Requirement already satisfied: fasttext in /Users/dgupta3/opt/anaconda3/lib/python3.9/site-packages (0.9.2)
Requirement already satisfied: numpy in /Users/dgupta3/opt/anaconda3/lib/python3.9/site-packages (from fasttext) (1.21.5)
Requirement already satisfied: pybind11>=2.2 in /Users/dgupta3/opt/anaconda3/lib/python3.9/site-packages (from fasttext) (2.10.0)
Requirement already satisfied: setuptools>=0.7.0 in /Users/dgupta3/opt/anaconda3/lib/python3.9/site-packages (from fasttext) (61.2.0)
[['__label__fr']]
Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.
In [27]:
dfs.head(20)
Out[27]:
Short description Description Caller Assignment group language
0 login issue verified user details employee manager name x checked user name ad reset password x advised user login check x caller confirmed able login x issue resolved spxjnwir pjlcoqds GRP_0 (__label__en,)
1 outlook x x received x x team x x meetings skype meetings etc appearing outlook calendar somebody please advise correct x x kind hmjdrvpb komuaywn GRP_0 (__label__en,)
2 cant log vpn x x received x x hi x x cannot log vpn x x best eylqgodm ybqkwiam GRP_0 (__label__en,)
3 unable access hr tool page unable access hr tool page xbkucsvz gcpydteq GRP_0 (__label__en,)
4 skype error skype error owlgqjme qhcozdfx GRP_0 (__label__ja,)
5 unable log engineering tool skype unable log engineering tool skype eflahbxn ltdgrvkz GRP_0 (__label__en,)
6 event critical hostname company com value mountpoint threshold oracle sid erpdata event critical hostname company com value mountpoint threshold oracle sid erpdata sr psa sr psa data perpsr psa jyoqwxhz clhxsoqy GRP_1 (__label__en,)
7 ticket employment status new non employee enter user name ticket employment status new non employee enter user name eqzibjhw ymebpoih GRP_0 (__label__en,)
8 unable disable add ins outlook unable disable add ins outlook mdbegvct dbvichlg GRP_0 (__label__en,)
9 ticket update inplant ticket update inplant fumkcsji sarmtlhy GRP_0 (__label__en,)
10 engineering tool says connected unable submit reports engineering tool says connected unable submit reports badgknqs xwelumfz GRP_0 (__label__en,)
11 hr tool site loading page correctly hr tool site loading page correctly dcqsolkx kmsijcuz GRP_0 (__label__en,)
12 unable login hr tool sgxqsuojr xwbesorf cards unable login hr tool sgxqsuojr xwbesorf cards oblekmrw qltgvspb GRP_0 (__label__en,)
13 user wants reset password user wants reset password iftldbmu fujslwby GRP_0 (__label__en,)
14 unable open payslips unable open payslips epwyvjsz najukwho GRP_0 (__label__fr,)
15 ticket update inplant ticket update inplant fumkcsji sarmtlhy GRP_0 (__label__en,)
16 unable login company vpn received hi unable login company vpn website trying open new session using link able get pls help urgently working home tomorrow due month end closing chobktqj qdamxfuc GRP_0 (__label__en,)
17 undocking pc screen come back undocking pc screen come back sigfdwcj reofwzlm GRP_3 (__label__en,)
18 erp sid account locked erp sid account locked nqdyowsm yqerwtna GRP_0 (__label__en,)
19 unable sign vpn unable sign vpn ftsqkvre bqzrupic GRP_0 (__label__en,)
In [ ]: